In [ ]:
import pandas as pd
import sqlalchemy as sa
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
In [ ]:
import plotly.io as pio

pio.templates.default = "seaborn"
pio.renderers.default = "vscode+notebook"
pd.options.display.max_columns = None
# print current renderer

0. Save data to sql database¶

0.1 Create a connection to the postgres server¶

In [ ]:
conn = "postgresql://postgres:pass@localhost/hr_database"

0.2 Save data to sql tables¶

Don't need to do this if data was already imported some other way.

In [ ]:
# employee_df = pd.read_csv("Datasets/employee_survey_data.csv", index_col=0)
# employee_df.to_sql("employee_survey_data", conn, if_exists="replace")
# general_df = pd.read_csv("Datasets/general_data.csv", index_col=8)
# general_df.to_sql("general_data", conn, if_exists="replace")
# manager_df = pd.read_csv("Datasets/manager_survey_data.csv", index_col=0)
# manager_df.to_sql("manager_survey_data", conn, if_exists="replace")

1. Importing the data¶

In [ ]:
employee_df = pd.read_sql_table("employee_survey_data", conn)
general_df = pd.read_sql_table("general_data", conn)
manager_df = pd.read_sql_table("manager_survey_data", conn)

#######################################
# Alternatively import from csv files #
#######################################
# employee_df = pd.read_csv("Datasets/employee_survey_data.csv", index_col=0)
# general_df = pd.read_csv("Datasets/general_data.csv", index_col=8)
# manager_df = pd.read_csv("Datasets/manager_survey_data.csv", index_col=0)
In [ ]:
display(employee_df.head())
display(general_df.head())
display(manager_df.head())
EmployeeID EnvironmentSatisfaction JobSatisfaction WorkLifeBalance
0 1 3.0 4.0 2.0
1 2 3.0 2.0 4.0
2 3 2.0 2.0 1.0
3 4 4.0 4.0 3.0
4 5 4.0 1.0 3.0
EmployeeID Age Attrition BusinessTravel Department DistanceFromHome Education EducationField EmployeeCount Gender JobLevel JobRole MaritalStatus MonthlyIncome NumCompaniesWorked Over18 PercentSalaryHike StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion YearsWithCurrManager
0 1 51 No Travel_Rarely Sales 6 2 Life Sciences 1 Female 1 Healthcare Representative Married 131160 1.0 Y 11 8 0 1.0 6 1 0 0
1 2 31 Yes Travel_Frequently Research & Development 10 1 Life Sciences 1 Female 1 Research Scientist Single 41890 0.0 Y 23 8 1 6.0 3 5 1 4
2 3 32 No Travel_Frequently Research & Development 17 4 Other 1 Male 4 Sales Executive Married 193280 1.0 Y 15 8 3 5.0 2 5 0 3
3 4 38 No Non-Travel Research & Development 2 5 Life Sciences 1 Male 3 Human Resources Married 83210 3.0 Y 11 8 3 13.0 5 8 7 5
4 5 32 No Travel_Rarely Research & Development 10 1 Medical 1 Male 1 Sales Executive Single 23420 4.0 Y 12 8 2 9.0 2 6 0 4
EmployeeID JobInvolvement PerformanceRating
0 1 3 3
1 2 2 4
2 3 3 3
3 4 2 3
4 5 3 3

2. Data Preprocessing¶

2.1 Joining the tables¶

In [ ]:
data_df = general_df.merge(employee_df, on="EmployeeID", how="left")
data_df = data_df.merge(manager_df, on="EmployeeID", how="left")

2.2 Adding in and out time data¶

In [ ]:
in_time_df = pd.read_csv("Datasets/in_time.csv")
out_time_df = pd.read_csv("Datasets/out_time.csv")
In [ ]:
# rename first column
in_time_df.rename(columns={"Unnamed: 0": "EmployeeID"}, inplace=True)
# change index to EmployeeID
in_time_df.set_index("EmployeeID", inplace=True)
# change dtype to datetime
in_time_df = in_time_df.apply(pd.to_datetime)

# repeat for out_time_df
out_time_df.rename(columns={"Unnamed: 0": "EmployeeID"}, inplace=True)
out_time_df.set_index("EmployeeID", inplace=True)
out_time_df = out_time_df.apply(pd.to_datetime)
In [ ]:
# create df with in_time and out_time difference
time_diff_df = out_time_df - in_time_df

# calculate average time ignoring NaN values
time_diff_df["AverageTime"] = time_diff_df.mean(axis=1, skipna=True)
# calculate total time ignoring NaN values
time_diff_df["TotalTime"] = time_diff_df.sum(axis=1, skipna=True)
# drop all columns except Average and Total
time_diff_df = time_diff_df[["AverageTime", "TotalTime"]]
# convert to seconds
time_diff_df = time_diff_df.apply(lambda x: x / np.timedelta64(1, "s"))

time_diff_df = time_diff_df.reset_index()
time_diff_df.head()
Out[ ]:
EmployeeID AverageTime TotalTime
0 1 26545.142241 6.185018e+06
1 2 27788.288136 6.585824e+06
2 3 25247.665289 6.135183e+06
3 4 25897.242553 6.111749e+06
4 5 28822.228571 7.090268e+06
In [ ]:
# join time_diff_df with data_df
data_df = data_df.merge(time_diff_df, on="EmployeeID", how="left")
data_df.head()
Out[ ]:
EmployeeID Age Attrition BusinessTravel Department DistanceFromHome Education EducationField EmployeeCount Gender JobLevel JobRole MaritalStatus MonthlyIncome NumCompaniesWorked Over18 PercentSalaryHike StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion YearsWithCurrManager EnvironmentSatisfaction JobSatisfaction WorkLifeBalance JobInvolvement PerformanceRating AverageTime TotalTime
0 1 51 No Travel_Rarely Sales 6 2 Life Sciences 1 Female 1 Healthcare Representative Married 131160 1.0 Y 11 8 0 1.0 6 1 0 0 3.0 4.0 2.0 3 3 26545.142241 6.185018e+06
1 2 31 Yes Travel_Frequently Research & Development 10 1 Life Sciences 1 Female 1 Research Scientist Single 41890 0.0 Y 23 8 1 6.0 3 5 1 4 3.0 2.0 4.0 2 4 27788.288136 6.585824e+06
2 3 32 No Travel_Frequently Research & Development 17 4 Other 1 Male 4 Sales Executive Married 193280 1.0 Y 15 8 3 5.0 2 5 0 3 2.0 2.0 1.0 3 3 25247.665289 6.135183e+06
3 4 38 No Non-Travel Research & Development 2 5 Life Sciences 1 Male 3 Human Resources Married 83210 3.0 Y 11 8 3 13.0 5 8 7 5 4.0 4.0 3.0 2 3 25897.242553 6.111749e+06
4 5 32 No Travel_Rarely Research & Development 10 1 Medical 1 Male 1 Sales Executive Single 23420 4.0 Y 12 8 2 9.0 2 6 0 4 4.0 1.0 3.0 3 3 28822.228571 7.090268e+06
In [ ]:
# change column dtype to str
data_df.columns = data_df.columns.astype(str)

2.2 Checking for null values¶

We will check for any null values, and decide if we want to drop them or impute them.

In [ ]:
num_null_rows = np.count_nonzero(data_df.isna())
print(f"Number of rows with null values: {num_null_rows}")
print(f"Percent of rows with null values: {num_null_rows/data_df.shape[0]*100:.3f}%")
Number of rows with null values: 111
Percent of rows with null values: 2.517%

There are not very many null values so we can safely drop them without affecting the final analysis.

In [ ]:
# drop rows with missing values
data_df_no_na = data_df.dropna()

2.3 Checking for duplicate values¶

In [ ]:
print(f"Duplicate Rows: {data_df_no_na.duplicated().sum()}")
Duplicate Rows: 0

There are no duplicate rows in the dataset.

3. Exploratory Data Analysis¶

The variables in the employee and manager survey are pretty self explanatory.


For now lets focus on the general data.
We will divide the variables to numerical and categorical.
In [ ]:
ignored_columns = [
    "EmployeeID",
]

numerical_columns = [
    # "EmployeeID",
    "Age",
    # "Attrition",
    # "BusinessTravel",
    # "Department",
    "DistanceFromHome",
    # "Education",
    # "EducationField",
    "EmployeeCount",
    # "Gender",
    # "JobLevel",
    # "JobRole",
    # "MaritalStatus",
    "MonthlyIncome",
    "NumCompaniesWorked",
    # "Over18",
    "PercentSalaryHike",
    "StandardHours",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "YearsAtCompany",
    "YearsSinceLastPromotion",
    "YearsWithCurrManager",
    # 'EnvironmentSatisfaction',
    # 'JobSatisfaction',
    # 'WorkLifeBalance',
    # 'JobInvolvement',
    # 'PerformanceRating'
    "AverageTime",
    "TotalTime",
]

categorical_columns = [
    var
    for var in data_df_no_na.columns
    if var not in numerical_columns and var not in ignored_columns
]

# # make categorical columns into category type
# for col in categorical_columns:
#     data_df[col] = data_df[col].astype("category")

Create functions to visualise numerical data and categorical data

In [ ]:
def numerical_vis(data, variable):
    from plotly.subplots import make_subplots

    hist_fig = px.histogram(data, x=variable, nbins=20)

    box_fig = px.box(data, y=variable)

    final_fig = make_subplots(rows=1, cols=2)
    final_fig.add_trace(hist_fig.data[0], row=1, col=1)
    final_fig.add_trace(box_fig.data[0], row=1, col=2)

    # title
    final_fig.update_layout(title_text=f"Histogram and Boxplot of {variable}")

    final_fig.show()
    # fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    # # hist plot
    # sns.histplot(x=data[variable], ax=ax[0], kde=True)
    # # box plot
    # sns.boxplot(x=data[variable], ax=ax[1])
    # # titles
    # ax[0].set_title(f"Histogram of {variable}")
    # ax[1].set_title(f"Boxplot of {variable}")
    # plt.show()


def categorical_vis(data, variable):
    # fig, ax = plt.subplots(1, 2, figsize=(12, 6))

    # # pie plot
    # data[variable].value_counts().plot(kind="pie", ax=ax[0], autopct="%1.1f%%")

    # # bar chart
    # sns.countplot(x=data[variable], ax=ax[1])
    # # titles
    # ax[0].set_title(f"Pie chart of {variable}")
    # ax[1].set_title(f"Bar chart of {variable}")
    pie_fig = px.pie(data, names=variable)

    bar_fig = px.histogram(data, x=variable, color=variable)

    final_fig = make_subplots(rows=1, cols=2, specs=[[{"type": "pie"}, {"type": "xy"}]])
    final_fig.add_trace(pie_fig.data[0], row=1, col=1)

    for index in range(len(bar_fig.data)):
        final_fig.add_trace(bar_fig.data[index], row=1, col=2)
    # barmode
    final_fig.update_layout(barmode="stack")
    # bargap
    final_fig.update_layout(bargap=0.1)
    # title
    final_fig.update_layout(title_text=f"Distribution of {variable}")
    # center title
    final_fig.update_layout(title_x=0.5)

    # hide legend
    final_fig.update_layout(showlegend=False)
    # title font size
    final_fig.update_layout(title_font_size=20)

    final_fig.show()

3.1 Overview of Numerical variables¶

In [ ]:
for var in numerical_columns:
    numerical_vis(data_df_no_na, var)

From the plots we can see a quick overview of the distribution of the variables.


The main thing we notice here is that StandardHours and EmployeeCount only have 1 value. As such they likely have no effect on Attrition, and we can safely drop these columns.
In [ ]:
data_df_no_na = data_df_no_na.drop(columns=["EmployeeCount", "StandardHours"])
# remove columns from numerical columns
numerical_columns.remove("EmployeeCount")
numerical_columns.remove("StandardHours")

data_df_no_na.head()
Out[ ]:
EmployeeID Age Attrition BusinessTravel Department DistanceFromHome Education EducationField Gender JobLevel JobRole MaritalStatus MonthlyIncome NumCompaniesWorked Over18 PercentSalaryHike StockOptionLevel TotalWorkingYears TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion YearsWithCurrManager EnvironmentSatisfaction JobSatisfaction WorkLifeBalance JobInvolvement PerformanceRating AverageTime TotalTime
0 1 51 No Travel_Rarely Sales 6 2 Life Sciences Female 1 Healthcare Representative Married 131160 1.0 Y 11 0 1.0 6 1 0 0 3.0 4.0 2.0 3 3 26545.142241 6.185018e+06
1 2 31 Yes Travel_Frequently Research & Development 10 1 Life Sciences Female 1 Research Scientist Single 41890 0.0 Y 23 1 6.0 3 5 1 4 3.0 2.0 4.0 2 4 27788.288136 6.585824e+06
2 3 32 No Travel_Frequently Research & Development 17 4 Other Male 4 Sales Executive Married 193280 1.0 Y 15 3 5.0 2 5 0 3 2.0 2.0 1.0 3 3 25247.665289 6.135183e+06
3 4 38 No Non-Travel Research & Development 2 5 Life Sciences Male 3 Human Resources Married 83210 3.0 Y 11 3 13.0 5 8 7 5 4.0 4.0 3.0 2 3 25897.242553 6.111749e+06
4 5 32 No Travel_Rarely Research & Development 10 1 Medical Male 1 Sales Executive Single 23420 4.0 Y 12 2 9.0 2 6 0 4 4.0 1.0 3.0 3 3 28822.228571 7.090268e+06

3.2 Overview of categorical variables¶

In [ ]:
for var in categorical_columns:
    categorical_vis(data_df_no_na, var)

The distribution of the categorical variables can be seen above.


All of the workers seem to be over 18, so this metric will not have an affect on Attrition. We can safely drop this column too.
In [ ]:
data_df_no_na = data_df_no_na.drop(columns=["Over18"])
# remove columns from categorical columns
categorical_columns.remove("Over18")

3.3 Data Correlation¶

We will change binary categorical variables to 1, 0 and ordered categorical data to numbers

In [ ]:
# ordered categorical data to numerical
data_df_no_na["Attrition"] = data_df_no_na["Attrition"].map({"Yes": 1, "No": 0})
# data_df["Over18"] = data_df["Over18"].map({"Y": 1, "N": 0})
data_df_no_na["BusinessTravel"] = data_df_no_na["BusinessTravel"].map(
    {"Non-Travel": 0, "Travel_Rarely": 1, "Travel_Frequently": 2}
)
data_df_no_na["BusinessTravel"].value_counts()
data_df_no_na.head()
Out[ ]:
EmployeeID Age Attrition BusinessTravel Department DistanceFromHome Education EducationField Gender JobLevel JobRole MaritalStatus MonthlyIncome NumCompaniesWorked PercentSalaryHike StockOptionLevel TotalWorkingYears TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion YearsWithCurrManager EnvironmentSatisfaction JobSatisfaction WorkLifeBalance JobInvolvement PerformanceRating AverageTime TotalTime
0 1 51 0 1 Sales 6 2 Life Sciences Female 1 Healthcare Representative Married 131160 1.0 11 0 1.0 6 1 0 0 3.0 4.0 2.0 3 3 26545.142241 6.185018e+06
1 2 31 1 2 Research & Development 10 1 Life Sciences Female 1 Research Scientist Single 41890 0.0 23 1 6.0 3 5 1 4 3.0 2.0 4.0 2 4 27788.288136 6.585824e+06
2 3 32 0 2 Research & Development 17 4 Other Male 4 Sales Executive Married 193280 1.0 15 3 5.0 2 5 0 3 2.0 2.0 1.0 3 3 25247.665289 6.135183e+06
3 4 38 0 0 Research & Development 2 5 Life Sciences Male 3 Human Resources Married 83210 3.0 11 3 13.0 5 8 7 5 4.0 4.0 3.0 2 3 25897.242553 6.111749e+06
4 5 32 0 1 Research & Development 10 1 Medical Male 1 Sales Executive Single 23420 4.0 12 2 9.0 2 6 0 4 4.0 1.0 3.0 3 3 28822.228571 7.090268e+06
In [ ]:
# check correlation
corr = data_df_no_na.corr()
px.imshow(
    corr,
    height=1200,
    width=1200,
    # min value of the color scale -1
    zmin=-1,
    # max value of the color scale 1
    zmax=1,
    # title
    title="Correlation Matrix",
    # color scale
    color_continuous_scale=px.colors.diverging.RdYlGn,
)
/tmp/ipykernel_473790/249899736.py:2: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

3.4 Visualise relationship of each variable to attrition¶

In [ ]:
def compare_categorical_vis(data, variable):
    fig = px.histogram(
        data,
        x=variable,
        color="Attrition",
        barmode="overlay",
        title=f"Histogram of {variable} by Attrition",
    )
    # bargap
    fig.update_layout(bargap=0.1)
    fig.show()


def compare_numerical_vis(data, variable):
    hist_fig = px.histogram(data, x=variable, color="Attrition")

    box_fig = px.box(data, x="Attrition", y=variable, color="Attrition")

    final_fig = make_subplots(rows=1, cols=2)
    final_fig.add_trace(hist_fig.data[0], row=1, col=1)
    final_fig.add_trace(hist_fig.data[1], row=1, col=1)

    final_fig.add_trace(box_fig.data[0], row=1, col=2)
    final_fig.add_trace(box_fig.data[1], row=1, col=2)
    # hide legend
    # final_fig.update_layout(showlegend=False)
    # title
    final_fig.update_layout(title_text=f"Attrition by {variable}")
    # title center
    final_fig.update_layout(title_x=0.5)
    # barmode
    final_fig.update_layout(barmode="overlay")
    final_fig.show()

Visualuse numerical variables¶

In [ ]:
for var in numerical_columns:
    compare_numerical_vis(data_df_no_na, var)

Visualuse categorical variables¶

In [ ]:
for var in categorical_columns:
    compare_categorical_vis(data_df_no_na, var)

4. Predicting Attrition¶

4.1 Preparing data¶

In [ ]:
# quick look at the data
data_df_no_na.head()
Out[ ]:
EmployeeID Age Attrition BusinessTravel Department DistanceFromHome Education EducationField Gender JobLevel JobRole MaritalStatus MonthlyIncome NumCompaniesWorked PercentSalaryHike StockOptionLevel TotalWorkingYears TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion YearsWithCurrManager EnvironmentSatisfaction JobSatisfaction WorkLifeBalance JobInvolvement PerformanceRating AverageTime TotalTime
0 1 51 0 1 Sales 6 2 Life Sciences Female 1 Healthcare Representative Married 131160 1.0 11 0 1.0 6 1 0 0 3.0 4.0 2.0 3 3 26545.142241 6.185018e+06
1 2 31 1 2 Research & Development 10 1 Life Sciences Female 1 Research Scientist Single 41890 0.0 23 1 6.0 3 5 1 4 3.0 2.0 4.0 2 4 27788.288136 6.585824e+06
2 3 32 0 2 Research & Development 17 4 Other Male 4 Sales Executive Married 193280 1.0 15 3 5.0 2 5 0 3 2.0 2.0 1.0 3 3 25247.665289 6.135183e+06
3 4 38 0 0 Research & Development 2 5 Life Sciences Male 3 Human Resources Married 83210 3.0 11 3 13.0 5 8 7 5 4.0 4.0 3.0 2 3 25897.242553 6.111749e+06
4 5 32 0 1 Research & Development 10 1 Medical Male 1 Sales Executive Single 23420 4.0 12 2 9.0 2 6 0 4 4.0 1.0 3.0 3 3 28822.228571 7.090268e+06

Employee ID is something that doesnt affect attrition, so we will drop it.

In [ ]:
# drop employee id
data_df_no_na = data_df_no_na.drop(columns=["EmployeeID"])

Next we will encode the categorical variables.

In [ ]:
data_df_numerical = pd.get_dummies(data_df_no_na)
data_df_numerical.head()
Out[ ]:
Age Attrition BusinessTravel DistanceFromHome Education JobLevel MonthlyIncome NumCompaniesWorked PercentSalaryHike StockOptionLevel TotalWorkingYears TrainingTimesLastYear YearsAtCompany YearsSinceLastPromotion YearsWithCurrManager EnvironmentSatisfaction JobSatisfaction WorkLifeBalance JobInvolvement PerformanceRating AverageTime TotalTime Department_Human Resources Department_Research & Development Department_Sales EducationField_Human Resources EducationField_Life Sciences EducationField_Marketing EducationField_Medical EducationField_Other EducationField_Technical Degree Gender_Female Gender_Male JobRole_Healthcare Representative JobRole_Human Resources JobRole_Laboratory Technician JobRole_Manager JobRole_Manufacturing Director JobRole_Research Director JobRole_Research Scientist JobRole_Sales Executive JobRole_Sales Representative MaritalStatus_Divorced MaritalStatus_Married MaritalStatus_Single
0 51 0 1 6 2 1 131160 1.0 11 0 1.0 6 1 0 0 3.0 4.0 2.0 3 3 26545.142241 6.185018e+06 0 0 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0
1 31 1 2 10 1 1 41890 0.0 23 1 6.0 3 5 1 4 3.0 2.0 4.0 2 4 27788.288136 6.585824e+06 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1
2 32 0 2 17 4 4 193280 1.0 15 3 5.0 2 5 0 3 2.0 2.0 1.0 3 3 25247.665289 6.135183e+06 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0
3 38 0 0 2 5 3 83210 3.0 11 3 13.0 5 8 7 5 4.0 4.0 3.0 2 3 25897.242553 6.111749e+06 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0
4 32 0 1 10 1 1 23420 4.0 12 2 9.0 2 6 0 4 4.0 1.0 3.0 3 3 28822.228571 7.090268e+06 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1

4.2 Splitting the data¶

We will split the data into train and test sets.

In [ ]:
from sklearn.model_selection import train_test_split

# get x and y from data
X = data_df_numerical.drop(columns=["Attrition"])
y = data_df_numerical["Attrition"]

# # moved this to the top
# # column names are currently sqlalchemy quoted_name
# # change to string
# X.columns = X.columns.astype(str)

# 80 20 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

4.3 Training the model¶

In [ ]:
from sklearn.metrics import accuracy_score, confusion_matrix


def train_model(model_class, Xtrain, Xtest, Ytrain, Ytest, **kwargs):
    model = model_class(**kwargs)
    model.fit(Xtrain, Ytrain)
    y_pred = model.predict(Xtest)

    rounder = lambda x: 1 if x > 0.5 else 0
    y_pred = np.array([rounder(x) for x in y_pred])

    acc = accuracy_score(Ytest, y_pred)
    cm = confusion_matrix(Ytest, y_pred)

    fig = px.imshow(
        cm,
        height=500,
        width=500,
        labels={"color": "Count", "x": "Predicted", "y": "Actual"},
    )
    # title
    fig.update_layout(
        title_text=f"Confusion Matrix of {model_class.__name__}\nAccuracy: {acc:.3f}"
    )
    # title center
    fig.update_layout(title_x=0.5)

    # x axis label
    fig.update_xaxes(title_text="Predicted")
    # y axis label
    fig.update_yaxes(title_text="Actual")
    fig.show()
    return model
In [ ]:
from sklearn.linear_model import LogisticRegression, LinearRegression, Perceptron
from sklearn.neighbors import KNeighborsClassifier

logistic_model = train_model(LogisticRegression, X_train, X_test, y_train, y_test)
linear_model = train_model(LinearRegression, X_train, X_test, y_train, y_test)
knn_model = train_model(
    KNeighborsClassifier, X_train, X_test, y_train, y_test, n_neighbors=5
)

All the models have an accuracy of around 80% This seems good at first glance, but lets take a look at the predictions to see whats going on.

In [ ]:
# logistic regression
y_pred = logistic_model.predict(X_test)
y_pred = [1 if x > 0.5 else 0 for x in y_pred]
print("LogisticRegression")
print(pd.Series(y_pred).value_counts())

# linear regression
y_pred = linear_model.predict(X_test)
y_pred = [1 if x > 0.5 else 0 for x in y_pred]
print("LinearRegression")
print(pd.Series(y_pred).value_counts())

# knn
y_pred = knn_model.predict(X_test)
print("KNeighborsClassifier")
print(pd.Series(y_pred).value_counts())
LogisticRegression
0    860
dtype: int64
LinearRegression
0    848
1     12
dtype: int64
KNeighborsClassifier
0    810
1     50
dtype: int64

As we can see, most of the models predict that the employee will not leave the company. And because we have around 80% people that do not leave the company, the model gets a decent accuracy score.

4.4 Trying a different model¶

In [ ]:
# xgboost classifier
from xgboost import XGBClassifier

xgb_model = train_model(XGBClassifier, X_train, X_test, y_train, y_test)

This classifier already has a much better accuracy score than the previous models. Let us calculate other metrics to see how it does.

In [ ]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    classification_report,
    confusion_matrix,
)

# print the accuracy, precision, sensitivity, and specificity
def print_metrics(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    sens = tp / (tp + fn)
    spec = tn / (tn + fp)
    # define accuracy in binary classification

    print(f"Accuracy: {acc:.3f}")
    print(f"Precision: {prec:.3f}")
    print(f"Sensitivity: {sens:.3f}")
    print(f"Specificity: {spec:.3f}")


print_metrics(y_test, xgb_model.predict(X_test))
Accuracy: 0.995
Precision: 1.000
Sensitivity: 0.971
Specificity: 1.000

Above,
Accuracy is the proportion of correct predictions
Precision is the proportion of positive identifications which were actually correct
Sensitivity is the proportion of actual positives which were identified correctly
Specificity is the proportion of actual negatives which were identified correctly
This model has good scores for all the relevant metrics.


We can now use this model to predict if an employee will leave the company or not. But the original dataset has a lot of variables, and we need all of these to predict if an employee will leave or not.


So we will use a feature importance method to select the most important variables and train a new model with a smaller number of variables. If this new model has a similar accuracy score, then we can use it for the predictions, with much simpler input data.
In [ ]:
feature_importance_df = pd.DataFrame(
    [xgb_model.feature_names_in_, xgb_model.feature_importances_]
).T
# rename columns
feature_importance_df.columns = ["feature", "importance"]
# sort by importance
feature_importance_df = feature_importance_df.sort_values(
    by="importance", ascending=False
)

# plot bar
fig = px.bar(feature_importance_df, x="importance", y="feature", orientation="h")
fig.show()

So the most important features (top 10) are:


  • EducationField

  • JobRole
  • MaritalStatus

  • TotalWorkingYears
  • EnvironmentSatisfaction
  • YearsSinceLastPromotion
  • BusinessTravel
  • Department

4.5 Training a new model with the most important features¶

Lets read in the data again, apply all the preprocessing steps, and train a new model with the most important features.

In [ ]:
selected_features = [
    "EducationField",
    "JobRole",
    "MaritalStatus",
    "TotalWorkingYears",
    "EnvironmentSatisfaction",
    "YearsSinceLastPromotion",
    "BusinessTravel",
    "Department",
]
data_df_small = data_df_no_na.copy()
# only keep selected features
data_df_small = data_df_small[selected_features + ["Attrition"]]
# get dummies
data_df_small = pd.get_dummies(data_df_small)

Xsmall = data_df_small.drop(columns=["Attrition"])
ysmall = data_df_small["Attrition"]

Xsmall_train, Xsmall_test, ysmall_train, ysmall_test = train_test_split(
    Xsmall, ysmall, test_size=0.2, stratify=ysmall, random_state=42
)

xgb_small_model = train_model(
    XGBClassifier, Xsmall_train, Xsmall_test, ysmall_train, ysmall_test
)
print_metrics(ysmall_test, xgb_small_model.predict(Xsmall_test))
Accuracy: 0.977
Precision: 0.961
Sensitivity: 0.892
Specificity: 0.993

The metrics for this model are still pretty good.


So we can use this model for the predictions.
In [ ]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

def create_encoders(dataframe :pd.DataFrame):
    transformer = make_column_transformer(
        (OneHotEncoder(), ["EducationField", "JobRole", "MaritalStatus", "BusinessTravel", "Department"]),
        remainder="passthrough",
    )
    transformed = transformer.fit_transform(dataframe)
    # transformed_df = pd.DataFrame(transformed)
    transformed_df:pd.DataFrame = pd.DataFrame.sparse.from_spmatrix(transformed)
    transformed_df.columns = transformer.get_feature_names_out()
    return transformer, transformed_df


def train_predicting_model():
    input_df = data_df_no_na.copy()
    # only select features
    final_X = input_df[selected_features]
    final_y = input_df["Attrition"]
    transformer, final_X = create_encoders(final_X)

    # train
    final_model = XGBClassifier()
    final_model.fit(final_X, final_y)
    return transformer, final_model



    # return final_df

final_transformer, final_model = train_predicting_model()

def get_prediction(
    education_field:str,
    job_role:str,
    marital_status:str,
    total_working_years:str,
    environment_satisfaction:str,
    years_since_last_promotion:str,
    business_travel:str,
    department:str,
):
    # create a dataframe
    input_df = pd.DataFrame(
        {
            "EducationField": [education_field],
            "JobRole": [job_role],
            "MaritalStatus": [marital_status],
            "TotalWorkingYears": [total_working_years],
            "EnvironmentSatisfaction": [environment_satisfaction],
            "YearsSinceLastPromotion": [years_since_last_promotion],
            "BusinessTravel": [business_travel],
            "Department": [department],
        }
    )
    # transform
    transformed = final_transformer.transform(input_df)
    # transformed_df = pd.DataFrame(transformed)
    transformed_df:pd.DataFrame = pd.DataFrame.sparse.from_spmatrix(transformed)
    transformed_df.columns = final_transformer.get_feature_names_out()
    # predict
    y_pred = final_model.predict(transformed_df)
    return y_pred[0]



row = data_df_no_na.sample()[selected_features]
pred = get_prediction(
    education_field=row["EducationField"].values[0],
    job_role=row["JobRole"].values[0],
    marital_status=row["MaritalStatus"].values[0],
    total_working_years=row["TotalWorkingYears"].values[0],
    environment_satisfaction=row["EnvironmentSatisfaction"].values[0],
    years_since_last_promotion=row["YearsSinceLastPromotion"].values[0],
    business_travel=row["BusinessTravel"].values[0],
    department=row["Department"].values[0],
)

print(pred)
0